import random
random.seed(42)
import nltk


def tokenize(s):
    """
    Splits the string 's' into tokens with nltk.word_tokenize and keeps only
    the purely alphabetic ones (tokens containing digits or punctuation are dropped).
    Returns the surviving tokens, lower-cased, as a list.
    """
    cleaned = []
    for token in nltk.word_tokenize(s):
        if not token.isalpha():
            continue  # skip anything that is not made of letters only
        cleaned.append(token.lower())
    return cleaned


def has_numbers(s):
    """
    Checks whether 's' contains at least one digit character.
    Returns True as soon as a digit is found; False if none is found
    (including when 's' is empty).
    """
    for ch in s:
        if ch.isdigit():
            return True
    return False


# def sample_random_sent(txt, N_SAMPLES=1):
#     """
#     Samples a random sentence from the given 'txt'. Does this N_SAMPLES times.
    
#     List contains strings: "1 random sentence" ( > 6 words)
#     Returns a list of length N_SAMPLES.
#     """
#     r1 = []
#     indices_used = []
#     sents = nltk.sent_tokenize(txt)
#     while True:
#         try:
#             i = random.randint(0, len(sents)-1)
#         except: # it does not have any sentences
#             break       
#         if i in indices_used:
#             continue
#         indices_used.append(i)      
#         sent = sents[i]
#         words = tokenize(sent)
#         if len(words) > 6:
#             r1.append(sent)       
#         if len(r1) == N_SAMPLES: # break out if we have the desired number of sentences
#             break
#         if len(indices_used) == len(sents): # break out if we have tried out every sentence
# #             print("Tried every sentID. Returning {} R1 strings (expected = {})".format(len(r1), N_SAMPLES))
#             break      
#     return r1


def sample_5_sents(txt, N_SAMPLES=1, skip_initial_chars=0):
    """
    Samples 5 sentences in a row from the given 'txt'. Does this up to N_SAMPLES times.
    Skip initial zero characters by default.

    The text (after dropping the first 'skip_initial_chars' characters) is split
    with nltk.sent_tokenize, and sentences shorter than 4 characters once stripped
    are discarded as noise. Sampled windows never share a sentence with each other.

    List contains strings: "5 random sentences in a row (i.e. sequential)" joined
    by a single space. No length requirements.
    Returns a list of at most N_SAMPLES strings; it is shorter (possibly empty) when
    the text has fewer than 5 usable sentences or no non-overlapping window is left.
    """
    window = 5
    raw_sents = nltk.sent_tokenize(txt[skip_initial_chars:])
    sents = [sent for sent in raw_sents if len(sent.strip()) >= 4] # filter out sents that are < 4 characters (noise)

    # Every start index whose 5-sentence window fits inside 'sents'.
    # The range is empty when there are fewer than 5 sentences, so no exception
    # handling is needed for that case.
    starts = list(range(len(sents) - window + 1))

    r5 = []
    while len(r5) < N_SAMPLES and starts:
        # Same randint draw as before for the first sample: starts[k] == k initially.
        i = starts[random.randint(0, len(starts) - 1)]
        r5.append(' '.join(sents[i:i + window]).strip())
        # A window starting at s overlaps [i, i+4] exactly when |s - i| < 5;
        # drop those starts so that no sentence is ever sampled twice.
        starts = [s for s in starts if abs(s - i) >= window]
    return r5